load("org_2000_2024.RData")
# merge the state names using the "statefip" variable
url <- "https://github.com/wsundstrom/econ173/raw/data/statefips.dta"
states <- read_dta(url) %>%
select(statefip = statefips, state = statename, statecd)
org <- org %>%
left_join(states)
# Scrape the HTML tables from the DOL state minimum-wage history page (rvest)
url <- "https://www.dol.gov/agencies/whd/state/minimum-wage/history"
# Read in pre-computed percentage changes for every occ2010
# (these values were generated in Python)
data <- read_excel("percentage_change_results.xlsx")
# Pull every <table> node from the page; this yields a list of tables
minwage <- read_html(url) %>%
  html_nodes("table")
# The tables covering the years we want (2000-) are the 3rd-6th on the page.
# Helper: extract the idx-th scraped table and name its first column "state".
# (Replaces four copy-pasted extraction blocks.)
get_minwage_table <- function(tables, idx) {
  tab <- html_table(tables[[idx]])
  colnames(tab)[1] <- "state"
  tab
}
minwage3 <- get_minwage_table(minwage, 3)
minwage4 <- get_minwage_table(minwage, 4)
minwage5 <- get_minwage_table(minwage, 5)
minwage6 <- get_minwage_table(minwage, 6)
# Combine the four year-range tables into one long table of state x year
# minimum wages. minwageL and minwageU are the lower and upper values for
# states that report a range; single-value states get minwageU = minwageL.
mintab <- minwage3 %>%
  left_join(minwage4, by = "state") %>%
  left_join(minwage5, by = "state") %>%
  left_join(minwage6, by = "state") %>%
  pivot_longer(-state, names_to = "year", values_to = "minwage") %>%
  # Split "low - high" ranges; remove = FALSE keeps the raw string column
  separate(minwage, into = c("minwageL", "minwageU"), sep = "-", remove = FALSE) %>%
  # Strip footnote markers/currency symbols, keeping digits (and "." for wages)
  mutate(year = as.numeric(gsub("[^0-9]+", "", year)),
         minwageL = as.numeric(gsub("[^0-9.]+", "", minwageL, perl = TRUE)),
         minwageU = ifelse(is.na(minwageU), minwageL, as.numeric(gsub("[^0-9.]+", "", minwageU, perl = TRUE))),
         # Hand-corrections for Nevada's two-tier 2022/2023 values
         minwageL = ifelse(state=="Nevada" & year==2022, 9.50, minwageL),
         minwageU = ifelse(state=="Nevada" & year==2022, 10.50, minwageU),
         minwageL = ifelse(state=="Nevada" & year==2023, 10.25, minwageL),
         minwageU = ifelse(state=="Nevada" & year==2023, 11.25, minwageU)) %>%
  # Drop the territories; keep states, DC, and the federal row
  filter(!(state %in% c("Guam", "Puerto Rico", "U.S. Virgin Islands")))
# Assign the federal minimum when the state minimum is missing or below it
fedmin <- mintab %>%
  filter(state=="Federal (FLSA)") %>%
  mutate(fedmin = minwageL) %>%
  select(year, fedmin)
mintab <- mintab %>%
  # Join explicitly on year (the only intended key)
  left_join(fedmin, by = "year") %>%
  # The federal floor binds wherever the state value is NA or lower
  mutate(minwageL = ifelse(is.na(minwageL) | fedmin>minwageL, fedmin, minwageL),
         minwageU = ifelse(is.na(minwageU) | fedmin>minwageU, fedmin, minwageU)) %>%
  filter(state!="Federal (FLSA)") %>%
  select(-minwage)
# Recode IPUMS sentinel codes for missing/NIU to NA.
# 999.99 marks missing hourly wages; 9999.99 marks missing weekly earnings.
# (The original recoded hourwage twice; the duplicate line is removed.)
org$hourwage  <- ifelse(org$hourwage  == 999.99,  NA, org$hourwage)
org$hourwage2 <- ifelse(org$hourwage2 == 999.99,  NA, org$hourwage2)
org$earnweek  <- ifelse(org$earnweek  == 9999.99, NA, org$earnweek)
org$earnweek2 <- ifelse(org$earnweek2 == 9999.99, NA, org$earnweek2)
# Map IPUMS CPS educ codes to human-readable education levels
org <- org %>%
  mutate(educ_level = case_when(
    educ == 2 ~ "None, preschool, or kindergarten",  # fixed typo ("kingergarten")
    educ == 10 ~ "Grades 1, 2, 3, or 4",
    educ == 20 ~ "Grades 5 or 6",
    educ == 30 ~ "Grades 7 or 8",
    educ == 40 ~ "Grade 9",
    educ == 50 ~ "Grade 10",
    educ == 60 ~ "Grade 11",
    educ == 71 ~ "12th grade, no diploma",
    educ == 73 ~ "High school diploma or equivalent",
    educ == 81 ~ "Some college but no degree",
    educ == 91 ~ "Associate's degree, occupational/vocational",
    educ == 92 ~ "Associate's degree, academic program",
    educ == 111 ~ "Bachelor's degree",
    educ == 123 ~ "Master's degree",
    educ == 124 ~ "Professional school degree",
    educ == 125 ~ "Doctorate degree",
    TRUE ~ as.character(educ) # keeps any existing values not matched
  ))
# Attach state minimum wages to ORG records (matched on year and state),
# compute the mean hourly wage within each year x occupation x state cell,
# then recode the NIU occupation code (9999) to NA.
org_minwage <- org %>%
  left_join(mintab, by = c("year", "state")) %>%
  group_by(year, occ2010, statecd) %>%
  mutate(mean_hourwage = mean(hourwage, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(occ2010 = ifelse(occ2010 == 9999, NA, occ2010))
Here I mutate org_minwage to calculate the mean hourly wage for each occupation by year and state code, while also removing NIU (not-in-universe) values from occ2010.
#### 2007-2010 recession
# Occupations with more than 500 observations over the window (small cells
# produce wild percentage swings)
recession_period <- data %>%
  filter(year >= 2004 & year <= 2010) %>%
  filter(count > 500)
# Hardest-hit: single-year losses of 11.48% or more during 2007-2010
rec_period_loss <- recession_period %>%
  filter(year >= 2007 & year <= 2010) %>%
  filter(percentage_change <= -11.48)
# Hand-picked top-10 loss occupations over the recession years
rec_loss <- data %>%
  filter(occ2010 %in% c(4010, 3800, 4920, 8220, 5810, 6230, 6420, 8740, 7340, 9100)) %>%
  filter(year >= 2007 & year <= 2010)
# Same occupations, but restricted to cells with count > 500
rec_per_loss_y_over_y <- recession_period %>%
  filter(occ2010 %in% c(4010, 3800, 4920, 8220, 5810, 6230, 6420, 8740, 7340, 9100)) %>%
  filter(year >= 2007 & year <= 2010)
# Mean percent change per year/occupation/count cell; .groups = "drop"
# returns an ungrouped result (consistent with the rest of the script)
avg_rec_per_loss <- recession_period %>%
  group_by(year, occ2010, count) %>%
  summarize(mean_per_change = mean(percentage_change, na.rm = TRUE), .groups = "drop")
Here the data is filtered to the specific occ2010 codes that saw the largest overall percentage decreases during the 2007-2010 recession. Counts are restricted to occupations with more than 500 observations to minimize small year-to-year differences that produce large percentage changes — e.g., an occupation going from 3 observations to 2 would register a -33.33% change.
# Figure 3: yearly percentage change for each occupation of interest,
# shown as dodged bars colored by occ2010
ggplot(rec_loss, aes(x = year, y = percentage_change, fill = factor(occ2010))) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Percentage Change by occ2010 Over Years",
       x = "Years",
       y = "Percentage Change",
       fill = "occ2010") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 16),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 14))
# Figure 4: average percentage change per occupation over the recession window
avg_perc_2006 <- rec_loss %>%
  group_by(occ2010) %>%
  summarize(avg_percentage_change = mean(percentage_change))
ggplot(avg_perc_2006, aes(x = factor(occ2010), y = avg_percentage_change, fill = factor(occ2010))) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Average Percentage Change by occ2010
Over 2007-2010 Recession",
       x = "occ2010",
       y = "Average Percentage Change",
       fill = "occ2010") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 16),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 14))
# Lookup table pairing each recession-era occ2010 code of interest with its
# occupation title (Figure 5 / the table in the write-up).
occ_lookup_rec <- data.frame(
occ2010 = c(3800, 4010, 4920, 5810, 6230, 6420, 7340, 8220, 8740, 9100),
occupation_name = c(
"Sheriffs, Bailiffs, Correctional Officers, and Jailers",
"First-Line Supervisors of Food Preparation and Serving Workers",
"Real Estate Brokers and Sales Agents",
"Data Entry Keyers",
"Carpenters",
"Painters, Construction and Maintenance",
"Maintenance and Repair Workers, General",
"Metal Workers and Plastic Workers",
"Inspectors, Testers, Sorters, Samplers, and Weighers",
"Bus and Ambulance Drivers and Attendants"
),
stringsAsFactors = FALSE
)
# Keep both columns (select is a no-op here) and sort by code
occ_table_rec <- occ_lookup_rec %>%
select(occ2010, occupation_name) %>% # Select relevant columns
arrange(occ2010) # Sort by occ2010 if desired
# Create a styled HTML table
occ_table_rec %>%
kable("html", caption = "Occupation Codes and Names for 2007-2010 Recession") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE)
| occ2010 | occupation_name |
|---|---|
| 3800 | Sheriffs, Bailiffs, Correctional Officers, and Jailers |
| 4010 | First-Line Supervisors of Food Preparation and Serving Workers |
| 4920 | Real Estate Brokers and Sales Agents |
| 5810 | Data Entry Keyers |
| 6230 | Carpenters |
| 6420 | Painters, Construction and Maintenance |
| 7340 | Maintenance and Repair Workers, General |
| 8220 | Metal Workers and Plastic Workers |
| 8740 | Inspectors, Testers, Sorters, Samplers, and Weighers |
| 9100 | Bus and Ambulance Drivers and Attendants |
The first bar chart, titled “Percentage Change by occ2010 Over Years,” displays the percentage change for each occupation of interest throughout the years of interest. The largest decreases are seen in 2008 and 2009. This graph is labeled as “Figure 3” in the Google Doc. The second bar chart, titled “Average Percentage Change by occ2010 Over 2007-2010 Recession,” displays the average percentage change for the occupations of interest over the same years. The occupations that saw the largest decreases were 4920, 5810, and 8220. This graph is labeled as “Figure 4” in the Google Doc. The table titled “Occupation Codes and Names for 2007-2010 Recession” lists the occupations of interest and matches each code with its occupation name. This is labeled as “Figure 5” in the Google Doc.
# Figure 6: real estate brokers (occ2010 == 4920) plotted alone, 2005-2010
real_estate <- rec_loss %>%
  filter(occ2010 %in% c(4920), year >= 2005, year <= 2010)
ggplot(real_estate, aes(x = year, y = percentage_change, color = factor(occ2010), group = occ2010)) +
  geom_line(size = 0.5) +
  labs(title = "Percentage Change by occ2010 Over Years",
       x = "Years",
       y = "Percentage Change",
       color = "occ2010") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 16),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 14))
I decided to plot real estate brokers on their own from 2005-2010, as I
noticed that during the 2007-2010 recession period this was an
occupation with a high percentage loss. The graph
demonstrates the decrease in real estate brokers due to the housing
bubble bursting. This is shown as Figure 6 in the Google Doc.
### Scatter Plot to Show the Minimum Wage Affecting Occupation Losses
# Flag each worker earning at or below 150% of the state minimum-wage upper
# bound (within_150pct = 1), restricted to the occupations/years of interest.
org_minwage_2007 <- org_minwage %>%
  group_by(year, occ2010, statecd) %>%
  filter(occ2010 %in% c(4010, 3800, 4920, 8220, 5810, 6230, 6420, 8740, 7340, 9100),
         year >= 2007, year <= 2010) %>%
  mutate(within_150pct = ifelse(hourwage <= 1.5 * minwageU, 1, 0)) %>%
  # Drop identifier/demographic columns not needed for the plots
  select(-cpsid,-month,-pernum,-wtfinl,-cpsidp,-earnweek2,-hourwage2,-age,-sex,-race,-nativity,-hispan,-empstat,-ind1990,-educ,-schlcoll,-earnweek,-earnwt,-educ_level) %>%
  filter(!is.na(within_150pct), !is.na(hourwage))
# Share of workers near the minimum wage, per occupation
org_minwage_2007_grouped <- org_minwage_2007 %>%
  group_by(occ2010) %>%
  summarise(mean_within_150pct = mean(within_150pct, na.rm = TRUE), .groups = "drop")
# Pair that share with each occupation's recession losses
org_minwage_2007_plot <- org_minwage_2007_grouped %>%
  left_join(rec_period_loss, by = "occ2010")
Throughout this chunk of code I am looking for ways to manipulate the data into a way that will be portrayed best graphically. I take the hourwage and determine that if it is within 150% of the upper bound of the minimum wage the observation is assigned a 1. I then group the full dataset by occ2010 to get the average of the binary variable, for example: if occ2010, 4920 has 4000 observations, and 3000 of these observations are within the binary range. Then the math is 3000/4000 = the binary percentage which in this case is 0.75.
####Scatter Plots for Occupations of Interest in 2007-2010
# Figure 7: interactive scatter of recession loss vs. share of workers
# within 150% of the minimum wage
ggplot_base <- ggplot(org_minwage_2007_plot, aes(x = mean_within_150pct, y = percentage_change, color = as.factor(occ2010))) +
  geom_point(aes(text = paste("Percent of occ2010 within 150% minimum wage:", mean_within_150pct)), size = 3) +
  labs(title = "Scatter Plot of Percentage Change vs. Mean Within 150%\n(2007-2010 Period)",
       x = "Mean Within 150%",
       y = "Percentage Change",
       color = "Occupation (occ2010)") +
  scale_y_reverse() +
  theme_minimal()
# Make it interactive; tooltip shows the `text` aesthetic defined above
interactive_plot <- ggplotly(ggplot_base, tooltip = "text")
interactive_plot
# OLS of each occupation's loss on its share of near-minimum-wage workers
fit_specific_rec <- lm(percentage_change ~ mean_within_150pct, data = org_minwage_2007_plot)
slope_specific_rec <- coef(fit_specific_rec)[2]
# NOTE(review): the "percentage_change" entry in coef_map names the response,
# not a coefficient, so it never matches and the intercept row is dropped
# from the table — confirm this is intentional.
Specific_Rec_Reg <- list("Linear Model" = fit_specific_rec)
modelsummary(Specific_Rec_Reg,
             coef_map = c("percentage_change"="Greatest Negative Percentage Change", "mean_within_150pct"="Within 150% of Minimum Wage"),
             stars = TRUE, gof_map = c("nobs", "r.squared"),
             title = "Percentage Change of Employment based on being within 150% of Minimum Wage during the 2007-2010 Recession for only Occupations of Interest",
             output = "default")
| Linear Model | |
|---|---|
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | |
| Within 150% of Minimum Wage | 2.728 |
| (5.068) | |
| Num.Obs. | 10 |
| R2 | 0.035 |
# Figure 8: same scatter with the fitted OLS line and a zero reference line
ggplot_base <- ggplot(org_minwage_2007_plot, aes(x = mean_within_150pct, y = percentage_change, color = as.factor(occ2010))) +
  geom_point(aes(text = paste("Percent of occ2010 within 150% minimum wage:", mean_within_150pct)), size = 3) +
  geom_smooth(aes(text = paste("Slope:", round(slope_specific_rec, 4))),
              method = "lm", se = FALSE, color = "black", linetype = "solid") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Scatter Plot of Percentage Change vs. Mean Within 150% (2007-2010) for occ2010 of Interest With Regression Line",
       x = "Mean Within 150%",
       y = "Percentage Change") +
  scale_y_reverse() +
  guides(color = "none") +
  theme_minimal()
interactive_plot <- ggplotly(ggplot_base, tooltip = "text")
interactive_plot
The two graphs share the exact same data; the reason for plotting them twice is that the first graph shows the spread of the percentage change in a more visually appealing way, while the graph with the regression line compresses the y-axis, placing the points very close to one another. The regression in the modelsummary table indicates that, among these occupations of interest, the closer an occupation is to the minimum wage, the more positive the percentage change. These are Figures 7 and 8 in the Google Doc, and the associated table is labeled Table 1 in the Google Doc.
####Data Manipulation for Total Occupations During 2007-2010 Recession Period
# Same 150%-of-minimum-wage flag as above, but for ALL occupations
all_occ_2007 <- org_minwage %>%
  group_by(year, occ2010, statecd) %>%
  filter(year >= 2007 & year <= 2010) %>%
  mutate(
    within_150pct = ifelse(hourwage <= 1.5 * minwageU, 1, 0) # Create binary variable
  ) %>%
  select(-cpsid,-month,-pernum,-wtfinl,-cpsidp,-earnweek2,-hourwage2,-age,-sex,-race,-nativity,-hispan,-empstat,-ind1990,-educ,-schlcoll,-earnweek,-earnwt,-educ_level) %>%
  filter(!is.na(within_150pct)) %>%
  filter(!is.na(hourwage))
all_occ_2007_a <- all_occ_2007 %>%
  group_by(occ2010, year) # Group by occ2010
# Bug fix: the NIU occupation code is the integer 9999, not 9999.99 — the
# old comparison could never match, so NIU rows were silently kept.
# (org_minwage already recodes 9999 to NA upstream, so this is a safeguard.)
all_occ_2007_a$occ2010 <- ifelse(all_occ_2007_a$occ2010 == 9999, NA, all_occ_2007_a$occ2010)
# Share of workers near the minimum wage, per occupation
all_occ_2007_binary <- all_occ_2007 %>%
  group_by(occ2010) %>%
  summarise(mean_within_150pct = mean(within_150pct, na.rm = TRUE), .groups = "drop")
# For each qualifying occupation (count >= 500), keep its worst (most
# negative) year within 2007-2010
total_per_change_rec <- data %>%
  filter(year >= 2007 & year <= 2010) %>%
  filter(count >= 500) %>%
  group_by(occ2010) %>%
  filter(percentage_change == min(percentage_change, na.rm = TRUE)) %>%
  ungroup()
# Pair each occupation's near-minimum-wage share with its worst loss,
# dropping occupations with no matching percentage change
worst_year_changes <- total_per_change_rec %>% select(occ2010, percentage_change)
all_occ_2007_b <- all_occ_2007_binary %>%
  left_join(worst_year_changes, by = "occ2010") %>%
  filter(!is.na(percentage_change))
The data is manipulated to create a mean for the binary variable of all occ2010 values that have counts over 500 for the specific years of interest. Then I filter for the lowest percent change for the occ2010 values for the years of interest. Then I join the data all together to have the percentage change and mean of the binary variable to give a percentage of the occ2010 near the minimum wage threshold.
# OLS over ALL occupations: worst loss vs. near-minimum-wage share (Table 2)
fit_rec <- lm(percentage_change ~ mean_within_150pct, data = all_occ_2007_b)
slope_rec <- coef(fit_rec)[2]
Rec_Reg <- list("Linear Model" = fit_rec)
modelsummary(Rec_Reg,
             coef_map = c("percentage_change"="Greatest Negative Percentage Change", "mean_within_150pct"="Within 150% of Minimum Wage"),
             stars = TRUE, gof_map = c("nobs", "r.squared"),
             title = "Percentage Change of Employment based on being within 150% of Minimum Wage during the 2007-2010 Recession",
             output = "default")
| Linear Model | |
|---|---|
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | |
| Within 150% of Minimum Wage | 3.077 |
| (2.305) | |
| Num.Obs. | 112 |
| R2 | 0.016 |
# Figure 9: all-occupation scatter with the fitted OLS line
ggplot_base <- ggplot(all_occ_2007_b, aes(x = mean_within_150pct, y = percentage_change, color = as.factor(occ2010))) +
  geom_point(aes(text = paste("Percent of occ2010 within 150% minimum wage:", mean_within_150pct)), size = 3) +
  geom_smooth(aes(text = paste("Slope:", round(slope_rec, 4))),
              method = "lm", se = FALSE, color = "black", linetype = "solid") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Scatter Plot of Percentage Change vs. Mean Within 150% (2007-2010)",
       x = "Mean Within 150%",
       y = "Percentage Change") +
  scale_y_reverse() +
  guides(color = "none") +
  theme_minimal()
interactive_plot <- ggplotly(ggplot_base, tooltip = "text")
interactive_plot
The regression table portraying the linear model shows a statistically insignificant coefficient of 3.077, and is shown as “Table 2” in the Google Doc. Among all occupations with counts over 500, a large proportion reside on the left side of the 0.50 average of the binary variable, which produces a positive correlation between percentage change and the mean of the binary. The slope of 3.0771 means that the closer an occ2010 is to the minimum wage, the smaller the percentage decrease the occupation would have seen. This is “Figure 9” in the Google Doc.
#### Taking occupation counts to create percentages of labor force for each year during 2007-2010
# Wide table of observation counts per occupation per year.
# (The original computed this table twice; the duplicate block is removed.)
recession_counts <- org %>%
  filter(year >= 2007 & year <= 2010) %>%       # Filter for years 2007-2010
  group_by(occ2010, year) %>%                   # Group by occ2010 and year
  summarise(count = n(), .groups = "drop") %>%  # Count observations
  pivot_wider(names_from = year, values_from = count, values_fill = 0)
# Recode the NIU code (9999) to NA, then drop those rows
recession_counts$occ2010 <- ifelse(recession_counts$occ2010 == 9999, NA, recession_counts$occ2010)
recession_counts <- recession_counts %>%
  filter(!is.na(occ2010))
# Column totals = total labor-force observations per year
total_recession <- recession_counts %>%
  summarise(across(starts_with("20"), sum, na.rm = TRUE))
# Each occupation's share of the year's labor force, in percent
recession_counts <- recession_counts %>%
  mutate(
    percent_2007 = (`2007` / total_recession$`2007`) * 100,
    percent_2008 = (`2008` / total_recession$`2008`) * 100,
    percent_2009 = (`2009` / total_recession$`2009`) * 100,
    percent_2010 = (`2010` / total_recession$`2010`) * 100
  )
recession_percent_filt <- recession_counts %>%
  filter(occ2010 %in% c(4010, 3800, 4920, 8220, 5810, 6230, 6420, 8740, 7340, 9100)) # Occupations of interest
# Take the state level during recession
state_rec_counts <- org_minwage %>%
  filter(year >= 2007 & year <= 2010) %>%       # Filter for years 2007-2010
  group_by(occ2010, year, state) %>%            # Group by occ2010, year, state
  summarise(count = n(), .groups = "drop") %>%  # Count observations
  pivot_wider(names_from = year, values_from = count, values_fill = 0)
# Filter out the N/A from occ2010 == 9999
state_rec_counts <- state_rec_counts %>%
  filter(!is.na(occ2010))
# Per-state totals across all occupations (one row per state)
state_total_rec <- state_rec_counts %>%
  group_by(state) %>%
  summarise(across(starts_with("20"), sum, na.rm = TRUE), .groups = "drop")
# Already one row per state, so re-summarising was a no-op; keep the alias
state_rec <- state_total_rec
# Bug fix: the old code divided each count column by state_rec$`YYYY`
# positionally, recycling the 51 state totals without matching on state,
# which misaligns whenever an occupation is missing from some states.
# Join the totals by state so every row is divided by its own state's total.
state_rec_perc <- state_rec_counts %>%
  left_join(state_rec, by = "state", suffix = c("", "_total")) %>%
  mutate(
    percent_2007 = (`2007` / `2007_total`) * 100,
    percent_2008 = (`2008` / `2008_total`) * 100,
    percent_2009 = (`2009` / `2009_total`) * 100,
    percent_2010 = (`2010` / `2010_total`) * 100
  ) %>%
  select(-ends_with("_total"))
state_rec_per_filt <- state_rec_perc %>%
  filter(occ2010 %in% c(4010, 3800, 4920, 8220, 5810, 6230, 6420, 8740, 7340, 9100))
# Reshape the state-level table to long form: one row per occ2010 x state x
# year, with side-by-side columns for raw counts and percent of employment.
long_rec <- state_rec_per_filt %>%
pivot_longer(
cols = c("2007", "2008", "2009", "2010", "percent_2007", "percent_2008", "percent_2009", "percent_2010"),
names_to = "key",
values_to = "value"
) %>%
mutate(
# Recover the year from either a bare "YYYY" or a "percent_YYYY" key
year = case_when(
key %in% c("2007", "2008", "2009", "2010") ~ key,
key %in% c("percent_2007", "percent_2008", "percent_2009", "percent_2010") ~ sub("percent_", "", key)
),
# Tag each row as a raw count or a percent-of-employment value
variable = case_when(
key %in% c("2007", "2008", "2009", "2010") ~ "counts",
key %in% c("percent_2007", "percent_2008", "percent_2009", "percent_2010") ~ "percent_employment"
)
) %>%
select(-key) %>%
# Spread back out so "counts" and "percent_employment" become columns
pivot_wider(names_from = variable, values_from = value)
# year comes out of the pivot as character; make it numeric for the joins below
long_rec <- long_rec %>%
mutate(year = as.numeric(year))
mintab <- mintab %>%
mutate(year = as.numeric(year))
# Bring the minimum-wage upper bound into long_rec, matched on year and state
mintab_subset <- mintab %>% select(state, year, minwageU)
long_rec <- long_rec %>%
  left_join(mintab_subset, by = c("year", "state"))
# Also attach the cell-level mean hourly wage and the state FIPS code
org_minwage <- org_minwage %>%
  mutate(year = as.numeric(year))
orgminwage_subset <- org_minwage %>% select(state, year, occ2010, mean_hourwage, statefip)
long_rec <- long_rec %>%
  left_join(orgminwage_subset, by = c("year", "state", "occ2010")) %>%
  distinct()
# Flag cells whose mean wage sits within 150% of the minimum-wage upper bound
long_rec <- long_rec %>%
  mutate(within_150pct = ifelse(mean_hourwage <= 1.5 * minwageU, 1, 0))
# How many state cells fall within the 150% band, per occupation and year
binary_rec <- long_rec %>%
  filter(within_150pct == 1) %>%
  count(occ2010, year, name = "count")
# Totals per occupation across all years
binary_total_rec <- binary_rec %>%
  count(occ2010, wt = count, name = "total_count")
# Attach the year-over-year percentage change for the occupations of interest
rec_per_loss_y_over_y_subset <- rec_per_loss_y_over_y %>% select(year, occ2010, percentage_change)
long_rec <- long_rec %>%
  left_join(rec_per_loss_y_over_y_subset, by = c("year", "occ2010"))
# Drop rows without a state FIPS code (no matching ORG record)
long_rec <- long_rec %>%
  filter(!is.na(statefip))
# Scale each state/occupation employment share by that occupation's share of
# near-minimum-wage workers, then export for the Python difference step.
# (Joining by occ2010 alone fans each occupation row out across states/years.)
states_strongest_rec <- long_rec %>%
  select(occ2010, year, percent_employment, state) %>%
  left_join(all_occ_2007_b, by = c("occ2010")) %>%
  mutate(employment_within_150 = percent_employment * mean_within_150pct) %>%
  select(-percentage_change)
write_xlsx(states_strongest_rec, "states_strongest_rec.xlsx")
Throughout this chunk of code, a lot of data manipulation takes place. In short, the goal is to create state-level variables of job loss year over year during the period of interest, 2007-2010. By creating percentages I can also view which occupations were proportionately the largest in each state and year. This sets up the data for a table demonstrating which states were proportionately hit the hardest in certain occ2010 values during the recessionary period.
# Read the employment differences computed in Python (Table 3)
states_per_change_2007 <- read_excel("employment_diff_results_rec.xlsx")
# Ten most negative employment differences
lowest_diff_2007 <- states_per_change_2007 %>%
  arrange(employment_diff) %>%
  slice_head(n = 10)
# Create a styled HTML table
lowest_diff_2007 %>%
  kable("html", caption = "Top 10 Lowest Employment Diff Values") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE)
| occ2010 | year | percent_employment | state | mean_within_150pct | employment_within_150 | employment_diff |
|---|---|---|---|---|---|---|
| 4920 | 2009 | 1.0167464 | Utah | 0.3780220 | 0.3843525 | -0.3251316 |
| 6230 | 2009 | 3.1952663 | Florida | 0.1165025 | 0.3722566 | -0.2735587 |
| 4920 | 2009 | 1.3579049 | Maryland | 0.3780220 | 0.5133179 | -0.2663049 |
| 4920 | 2010 | 0.3558719 | Utah | 0.3780220 | 0.1345274 | -0.2498251 |
| 4920 | 2008 | 1.0224274 | Nevada | 0.3780220 | 0.3865000 | -0.2225721 |
| 4920 | 2009 | 2.2380291 | California | 0.3780220 | 0.8460242 | -0.2219835 |
| 4920 | 2010 | 2.5787966 | Florida | 0.3780220 | 0.9748418 | -0.2129728 |
| 5810 | 2009 | 1.2874155 | California | 0.2327531 | 0.2996499 | -0.2060117 |
| 4010 | 2009 | 0.6300630 | Nevada | 0.5446339 | 0.3431537 | -0.2002988 |
| 4010 | 2009 | 0.5684627 | Ohio | 0.5446339 | 0.3096040 | -0.1978614 |
Here I create a table with the mutated data from the block above this one. The percent changes are calculated in Python for ease of computation and then read right back into R. From the table produced, we can see a trend in which the occ2010 code 4920 takes 6 of the 10 spots. The code 4920 corresponds to real estate brokers, which makes sense given the financial crisis and the bursting of the housing market bubble: many real estate brokers lost their jobs during this period. Utah appears on the table twice for the same occ2010 code in back-to-back years (2009 and 2010), showing that Utah's housing market took a large hit for multiple years.
This is labeled as “Table 3” in the Google Doc write-up.
####State Graphs for the 2007-2010 Recession Period
# Census state FIPS codes paired with state names (50 states + DC).
# Codes are non-contiguous because some historical codes are unassigned.
state_mapping <- data.frame(
statefip = c(1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51,
53, 54, 55, 56),
state_name = c(
"Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
"Connecticut", "Delaware", "District of Columbia", "Florida", "Georgia",
"Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky",
"Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota",
"Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire",
"New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota",
"Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island",
"South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont",
"Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"
)
)
# Distinct state FIPS codes present in the data (character form).
# NOTE(review): this object is not used later in the visible script.
statefip_rec <- long_rec %>%
  select(statefip) %>%
  distinct() %>%
  mutate(statefip = as.character(statefip))
# Attach human-readable state names by FIPS code. The old code also created
# a stray `statefip_rec` column inside state_mapping (a likely typo for
# `statefip = as.character(...)`) that served no purpose; join directly on
# the shared statefip key instead.
long_rec <- long_rec %>%
  left_join(state_mapping, by = "statefip")
# Write one bar chart per state (percent employment by occupation and year,
# for the occupations of interest) as PNGs under `output_dir`.
#
# long_rec: long-format data with columns statefip, state_name, year,
#           percent_employment, occ2010
# output_dir: folder (relative to getwd()) to save the plots into
create_state_plots1 <- function(long_rec, output_dir = "state_plots_rec") {
  # Create output directory if it doesn't exist (recursive handles nested paths)
  full_output_dir <- file.path(getwd(), output_dir)
  if (!dir.exists(full_output_dir)) {
    dir.create(full_output_dir, recursive = TRUE)
  }
  # Ensure statefip is a character in long_rec so comparisons are type-stable
  # (fixed comment: previously referred to "long_covid" from another section)
  long_rec <- long_rec %>% mutate(statefip = as.character(statefip))
  # Get unique statefip codes
  statefips <- unique(long_rec$statefip)
  # Iterate over each statefip and create a plot
  for (statefip_code in statefips) {
    # Filter data for the specific statefip
    state_data <- long_rec %>% filter(statefip == statefip_code)
    # Human-readable state name for the title (assumes a 1:1 FIPS -> name map)
    state_name <- unique(state_data$state_name)
    # Only plot when there is data for this state
    if (nrow(state_data) > 0) {
      p <- ggplot(state_data, aes(x = year, y = percent_employment, fill = as.factor(occ2010))) +
        geom_bar(stat = "identity", position = "dodge") +
        labs(
          title = paste("Percent Employment in State", state_name),
          x = "Year",
          y = "Percent Employment",
          fill = "Occupation"
        ) +
        theme_minimal() +
        theme(
          axis.text.x = element_text(angle = 45, hjust = 1),
          strip.text = element_text(size = 8)
        )
      # File named by FIPS code, e.g. state_6.png for California
      ggsave(filename = file.path(full_output_dir, paste0("state_", statefip_code, ".png")), plot = p, width = 10, height = 6)
    } else {
      warning(paste("No data found for statefip:", statefip_code))
    }
  }
}
create_state_plots1(long_rec, output_dir = "state_plots_rec")
This code creates an individual graph for each state in the dataset, keyed by its statefip code. The graphs depict the percent employment of the occupations of interest chosen earlier for the 2007-2010 recession period. These graphs provide a visual representation for anyone curious to see which states had the highest percent employment in these occupations (spoiler: one of them is California). The ratios come from earlier code that takes the total count of occ2010 values by state and year and divides the counts for the occupations of interest by that total to give a percentage. The function iterates over each statefip code until reaching the last one (statefip = 56, Wyoming) and outputs the graphs into a new folder in the working directory. (To view these graphs, the code will have to be run on your own.)
The Utah graph was used in the Google Doc write-up and was labeled “Figure 10.”
#### COVID-19 recession
# Occupations with more than 500 observations during the COVID window
covid_period <- data %>%
  filter(year >= 2019, year <= 2022) %>%
  filter(count > 500)
# Largest single-year losses (-13.83% or worse) within that window
covid_per_loss <- covid_period %>%
  filter(year >= 2019, year <= 2022) %>%
  filter(percentage_change <= -13.83)
# Hand-picked top-10 loss occupations for the COVID period
covid_loss <- data %>%
  filter(occ2010 %in% c(4030, 2200, 3500, 5940, 7200, 4720, 3650, 4110, 4010, 4600)) %>%
  filter(year >= 2019, year <= 2022)
# Alternate, larger candidate set kept for reference:
#filter(occ2010 %in% c(800, 2200, 3500, 3650, 3930, 4010, 4030, 4110, 4230, 4600, 4720, 5940, 6420, 6050, 5200, 7200, 7700, 8965, 9600, 4250)) %>%
covid_per_loss_y_over_y <- covid_period %>%
  filter(occ2010 %in% c(4030, 2200, 3500, 5940, 7200, 4720, 3650, 4110, 4010, 4600)) %>%
  filter(year >= 2019, year <= 2022)
Here the data is manipulated for the COVID-19 period, 2019-2022. The data is filtered in the same way as before: keeping occ2010 codes with counts over 500 and then hand-picking the 10 largest negative percentage_change values.
# Year-over-year percentage change for each occupation of interest, 2019-2022,
# drawn as grouped (dodged) bars.
ggplot(covid_loss, aes(x = factor(year), y = percentage_change, fill = factor(occ2010))) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14)
  ) +
  labs(
    title = "Percentage Change by occ2010 Over Years",
    x = "Years",
    y = "Percentage Change",
    fill = "occ2010"
  )
# Mean percentage change per occupation across the COVID window (2019-2022).
# The original comment promised NA handling, but mean() propagates NA by
# default; na.rm = TRUE makes that intent real.
avg_perc_covid <- covid_loss %>%
  group_by(occ2010) %>%
  summarize(avg_percentage_change = mean(percentage_change, na.rm = TRUE))
# Average percentage change per occupation over the whole COVID window,
# one dodged bar per occ2010 code.
ggplot(avg_perc_covid, aes(x = factor(occ2010), y = avg_percentage_change, fill = factor(occ2010))) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14)
  ) +
  labs(
    title = "Average Percentage Change by occ2010\nOver COVID-19 Period (2019-2022)",
    x = "occ2010",
    y = "Average Percentage Change",
    fill = "occ2010"
  )
# Lookup table pairing each COVID-period occ2010 code with a readable label.
covid_lookup_codes <- c(2200, 3500, 3650, 4010, 4030, 4110, 4600, 4720, 5940, 7200)
covid_lookup_titles <- c(
  "Postsecondary Teachers",
  "Licensed Practical and Licensed Vocational Nurses",
  "Medical Assistants and Other Healthcare Support Occupations",
  "First-Line Supervisors of Food Preparation and Serving Workers",
  "Food Preparation Workers",
  "Waiters and Waitresses",
  "Childcare Workers",
  "Cashiers",
  "Office and administrative support workers (Farming, Fishing, and Forestry)",
  "Automotive Service Technicians and Mechanics"
)
occ_lookup_covid <- data.frame(
  occ2010 = covid_lookup_codes,
  occupation_name = covid_lookup_titles
)
# Sort the lookup by code and render it as a styled HTML table via kableExtra.
occ_table_covid <- occ_lookup_covid %>%
  arrange(occ2010) %>%
  select(occ2010, occupation_name)
occ_table_covid %>%
  kable("html", caption = "Occupation Codes and Names for 2019-2022 Recession") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE
  )
| occ2010 | occupation_name |
|---|---|
| 2200 | Postsecondary Teachers |
| 3500 | Licensed Practical and Licensed Vocational Nurses |
| 3650 | Medical Assistants and Other Healthcare Support Occupations |
| 4010 | First-Line Supervisors of Food Preparation and Serving Workers |
| 4030 | Food Preparation Workers |
| 4110 | Waiters and Waitresses |
| 4600 | Childcare Workers |
| 4720 | Cashiers |
| 5940 | Office and administrative support workers (Farming, Fishing, and Forestry) |
| 7200 | Automotive Service Technicians and Mechanics |
Here the same pattern of bar charts is created as was used for the recessionary period. The first graph displays the year-over-year changes for the occupations of interest, while the second graph shows each occupation's percentage change averaged across the four years.
The graph labeled, “Percentage Change by occ2010 Over Years” takes the percentage change of the occ2010 codes of interest and shows the percentage changes year over year for the years of interest. This is labeled as “Figure 13” in the Google Doc.
The graph labeled, “Average Percentage Change by occ2010 Over COVID-19 Period (2019-2022)” takes the average percentage change of the occ2010 codes of interest and shows the percentage changes over the years of interest. This is labeled as “Figure 14” in the Google Doc.
The table labeled, “Occupation Codes and Names for 2019-2022 Recession” provides the occupation codes of interest during the COVID-19 crisis, and is labeled as “Figure 15” in the Google Doc.
### Scatter Plot for Covid to Display Minimum Wage Along Percentage Decrease of occ2010
# Restrict the wage microdata to the COVID-period occupations of interest and
# flag workers whose hourly wage is within 150% of the state's upper minimum
# wage bound (within_150pct = 1) or above it (0).
# NOTE(review): group_by() here performs no aggregation — the ifelse() mutate is
# row-wise — but it does make select() retain the grouping columns; confirm the
# grouping is intentional rather than a leftover.
org_minwage_2019 <- org_minwage%>%
group_by(year, occ2010, statecd) %>%
filter(occ2010 %in% c(4030, 2200, 3500, 5940, 7200, 4720, 3650, 4110, 4010, 4600)) %>%
filter(year >= 2019 & year <= 2022) %>%
mutate(
within_150pct = ifelse(hourwage <= 1.5 * minwageU, 1, 0) # Create binary variable
) %>%
# Drop identifier/demographic columns that are not needed downstream.
select(-cpsid,-month,-pernum,-wtfinl,-cpsidp,-earnweek2,-hourwage2,-age,-sex,-race,-nativity,-hispan,-empstat,-ind1990,-educ,-schlcoll,-earnweek,-earnwt,-educ_level) %>%
# Keep only records with a usable wage and flag value.
filter(!is.na(within_150pct)) %>%
filter(!is.na(hourwage))
# Share of each occupation's observations earning within 150% of the minimum wage.
org_minwage_2019_grouped <- org_minwage_2019 %>%
  group_by(occ2010) %>%
  summarise(mean_within_150pct = mean(within_150pct, na.rm = TRUE), .groups = "drop")
# Attach the COVID-period percentage losses so both measures can be plotted together.
org_minwage_2019_plot <- left_join(org_minwage_2019_grouped, covid_per_loss, by = "occ2010")
The dataframes are joined with one another following the same rules as the recessionary periods.
# Interactive scatter: low-wage share (x) vs. employment loss (y), one point
# per occupation; the aes(text = ...) string becomes the plotly hover tooltip.
ggplot_base <- ggplot(
  org_minwage_2019_plot,
  aes(x = mean_within_150pct, y = percentage_change, color = as.factor(occ2010))
) +
  geom_point(
    aes(text = paste("Percent of occ2010 within 150% minimum wage:", mean_within_150pct)),
    size = 3
  ) +
  scale_y_reverse() +
  theme_minimal() +
  labs(
    title = "Scatter Plot of Percentage Change vs. Mean Within 150%\n(COVID-19 Period)",
    x = "Mean Within 150%",
    y = "Percentage Change",
    color = "Occupation (occ2010)"
  )
# Convert the static ggplot into an interactive plotly widget and display it.
interactive_plot <- ggplotly(ggplot_base, tooltip = "text")
interactive_plot
# Regress employment loss on the low-wage share for the occupations of interest.
fit_specific_covid <- lm(percentage_change ~ mean_within_150pct, data = org_minwage_2019_plot)
slope_specific_covid <- coef(fit_specific_covid)[2]
Specific_Covid_Reg <- list("Linear Model" = fit_specific_covid)
# coef_map both relabels and restricts the rows shown (the intercept is
# intentionally omitted). The original also listed "percentage_change", but the
# response variable is never a coefficient, so that entry was a no-op and is
# removed. stars = TRUE replaces the reassignable shorthand T.
modelsummary(Specific_Covid_Reg,
  coef_map = c("mean_within_150pct" = "Within 150% of Minimum Wage"),
  stars = TRUE, gof_map = c("nobs", "r.squared"),
  title = "Percentage Change of Employment based on being within 150% of Minimum Wage during the 2019-2022 Recession for only Occupations of Interest",
  output = "default")
| Linear Model | |
|---|---|
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | |
| Within 150% of Minimum Wage | 0.024 |
| (3.787) | |
| Num.Obs. | 11 |
| R2 | 0.000 |
# Same scatter as above, plus the fitted regression line (slope shown in its
# tooltip) and a dashed red reference line at zero percentage change.
ggplot_base <- ggplot(
  org_minwage_2019_plot,
  aes(x = mean_within_150pct, y = percentage_change, color = as.factor(occ2010))
) +
  geom_point(
    aes(text = paste("Percent of occ2010 within 150% minimum wage:", mean_within_150pct)),
    size = 3
  ) +
  geom_smooth(
    aes(text = paste("Slope:", round(slope_specific_covid, 4))),
    method = "lm", se = FALSE, color = "black", linetype = "solid"
  ) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  scale_y_reverse() +
  guides(color = "none") +
  theme_minimal() +
  labs(
    title = "Scatter Plot of Percentage Change vs. Mean Within 150% (2019-2022) for occ2010 of Interest",
    x = "Mean Within 150%",
    y = "Percentage Change"
  )
interactive_plot <- ggplotly(ggplot_base, tooltip = "text")
interactive_plot
The graphs created here show the occupations of interest that were used in the bar charts. The y-axis on the first graph is spread out to give a better visualization of the differences between the data points. The second graph adds the regression line, whose slope of 0.024 suggests that occupations with a larger share of workers near the minimum wage saw slightly larger percentage changes. However, the coefficient is very small and not statistically significant.
The graph labeled, “Scatter Plot of Percentage Change vs. Mean Within 150% (COVID-19 Period)” displays the highest negative percentage change on the y-axis, and shows the percent of the occupation total within the minimum wage threshold of low wage on the x-axis, this is labeled as “Figure 16” on the Google Doc.
The graph labeled, “Scatter Plot of Percentage Change vs. Mean Within 150% (2019-2022) for occ2010 of Interest” displays the highest negative percentage change on the y-axis, and shows the percent of the occupation total within the minimum wage threshold of low wage on the x-axis, the regression line shows the best fit based on the graph. This is labeled as “Figure 17” on the Google Doc.
The table labeled, “Percentage Change of Employment based on being within 150% of Minimum Wage during the 2019-2022 Recession for only Occupations of Interest” displays the regression results which is used to fit the line for Figure 17. The slope is not statistically significant and has too low of an R^2 to be considered significant. This is labeled as “Table 4” in the Google Doc.
# Same construction as org_minwage_2019 above, but across ALL occupations (no
# occ2010 restriction): flag workers whose hourly wage is within 150% of the
# state's upper minimum wage bound.
# NOTE(review): group_by() performs no aggregation before the row-wise mutate,
# but it makes select() retain the grouping columns; confirm it is intentional.
all_occ_2019 <- org_minwage %>%
group_by(year, occ2010, statecd) %>%
filter(year >= 2019 & year <= 2022) %>%
mutate(
within_150pct = ifelse(hourwage <= 1.5 * minwageU, 1, 0) # Create binary variable
) %>%
# Drop identifier/demographic columns that are not needed downstream.
select(-cpsid,-month,-pernum,-wtfinl,-cpsidp,-earnweek2,-hourwage2,-age,-sex,-race,-nativity,-hispan,-empstat,-ind1990,-educ,-schlcoll,-earnweek,-earnwt,-educ_level) %>%
filter(!is.na(within_150pct)) %>%
filter(!is.na(hourwage))
all_occ_2019_a <- all_occ_2019 %>%
  group_by(occ2010, year) # Group by occ2010 and year
# Recode the "no occupation" sentinel to NA. The original compared against
# 9999.99 with ==, which can never match the 9999 sentinel used elsewhere in
# this file (see the national counts section) and is a float-equality footgun
# besides; 9999 is the value that actually marks missing occupations.
all_occ_2019_a$occ2010 <- ifelse(all_occ_2019_a$occ2010 == 9999, NA, all_occ_2019_a$occ2010)
# Per-occupation share of observations within 150% of the minimum wage.
all_occ_2019_binary <- all_occ_2019 %>%
  group_by(occ2010) %>%
  summarise(mean_within_150pct = mean(within_150pct, na.rm = TRUE), .groups = "drop") # Calculate the mean
# For each sufficiently common occupation, keep the single worst (most negative)
# percentage change observed in the COVID window.
# NOTE(review): this uses count >= 500 where the occupations-of-interest section
# used count > 500 — confirm which threshold is intended.
total_per_change_rec <- data %>%
  filter(year >= 2019, year <= 2022) %>%
  filter(count >= 500) %>%
  group_by(occ2010) %>%
  filter(percentage_change == min(percentage_change, na.rm = TRUE)) %>%
  ungroup()
# Merge the worst-year losses onto the low-wage shares; occupations without a
# qualifying loss record are dropped.
all_occ_2019_b <- all_occ_2019_binary %>%
  left_join(select(total_per_change_rec, occ2010, percentage_change), by = "occ2010") %>%
  filter(!is.na(percentage_change))
The data is manipulated to encompass the COVID-19 recession and is restricted to occupations with counts of at least 500. The binary variable is again created and its average is taken per occ2010 value, giving the share of each occupation's observations that fall within the 150% minimum wage threshold.
# Regress employment loss on the low-wage share across ALL occupations.
fit_covid <- lm(percentage_change ~ mean_within_150pct, data = all_occ_2019_b)
slope_covid <- coef(fit_covid)[2]
Covid_Reg <- list("Linear Model" = fit_covid)
# coef_map relabels and restricts the rows shown (intercept omitted). The
# original "percentage_change" entry named the response variable, which is
# never a coefficient, so it was a no-op and is removed. stars = TRUE replaces
# the reassignable shorthand T.
modelsummary(Covid_Reg,
  coef_map = c("mean_within_150pct" = "Within 150% of Minimum Wage"),
  stars = TRUE, gof_map = c("nobs", "r.squared"),
  title = "Percentage Change of Employment based on being within 150% of Minimum Wage during the 2019-2022 Recession",
  output = "default")
| Linear Model | |
|---|---|
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | |
| Within 150% of Minimum Wage | -8.753* |
| (3.334) | |
| Num.Obs. | 93 |
| R2 | 0.070 |
# Scatter across all occupations with the fitted line and a zero reference;
# rendered as an interactive plotly widget with custom hover text.
ggplot_base <- ggplot(
  all_occ_2019_b,
  aes(x = mean_within_150pct, y = percentage_change, color = as.factor(occ2010))
) +
  geom_point(
    aes(text = paste("Percent of occ2010 within 150% minimum wage:", mean_within_150pct)),
    size = 3
  ) +
  geom_smooth(
    aes(text = paste("Slope:", round(slope_covid, 4))),
    method = "lm", se = FALSE, color = "black", linetype = "solid"
  ) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  scale_y_reverse() +
  guides(color = "none") +
  theme_minimal() +
  labs(
    title = "Scatter Plot of Percentage Change vs. Mean Within 150% (2019-2022)",
    x = "Mean Within 150%",
    y = "Percentage Change"
  )
interactive_plot <- ggplotly(ggplot_base, tooltip = "text")
interactive_plot
The graph is displayed for the total amount of occ2010 that had a count over 500 during this time frame of 2019-2022. The COVID-19 recession has a slope of -8.7534 and is statistically significant which is shown in the linear model regression. This means that as the occ2010 gets closer to a 100% value for being near the minimum wage, these jobs experienced a higher percentage change loss than jobs that were not near the minimum wage threshold. This is labeled as “Figure 18” in the Google Doc.
The table labeled, “Percentage Change of Employment based on being within 150% of Minimum Wage during the 2019-2022 Recession” displays the results for the regression data that is plotted in Figure 18. The line is statistically significant with the highest R^2 value displayed throughout all the tables. This table is labeled as “Table 5.”
#### Taking occupation counts to create percentages of labor force for each year during 2019-2022
# National observation counts per occupation and year, reshaped wide (one
# column per year 2019-2022).
national_covid_counts <- org %>%
filter(year >= 2019 & year <= 2022) %>% # Filter for years 2019-2022 (the original comment incorrectly said 2007-2010)
group_by(occ2010, year) %>% # Group by occ2010 and year
summarise(count = n(), .groups = "drop") %>% # Count observations
pivot_wider(names_from = year, values_from = count, values_fill = 0) # Pivot to wide format
national_covid_counts$occ2010 <- ifelse(national_covid_counts$occ2010 == 9999, NA, national_covid_counts$occ2010) # 9999 is the "no occupation" sentinel
# Filter out the N/A from occ2010 == 9999
national_covid_counts <- national_covid_counts %>%
filter(!is.na(occ2010))
# National totals per year across all occupations. across() has deprecated
# passing extra arguments through ...; wrap sum in an anonymous function so
# na.rm = TRUE is applied without a deprecation warning.
national_total_covid <- national_covid_counts %>%
  summarise(across(starts_with("20"), function(x) sum(x, na.rm = TRUE)))
# Each occupation's share of national observations, per year. The totals are
# single values (one-row data frame), so the division recycles safely.
national_covid_counts <- national_covid_counts %>%
  mutate(
    percent_2019 = (`2019` / national_total_covid$`2019`) * 100,
    percent_2020 = (`2020` / national_total_covid$`2020`) * 100,
    percent_2021 = (`2021` / national_total_covid$`2021`) * 100,
    percent_2022 = (`2022` / national_total_covid$`2022`) * 100
  )
# Keep only the ten COVID-period occupations of interest.
national_covid_percent_filt <- national_covid_counts %>%
  filter(occ2010 %in% c(2200, 3500, 3650, 4010, 4030, 4110, 4600, 4720, 5940, 7200))
# Take total by states, years for occupation to show state based labor participation during COVID-19
state_covid_counts <- org_minwage %>%
filter(year >= 2019 & year <= 2022) %>% # Filter for years 2019-2022 (the original comment incorrectly said 2007-2010)
group_by(occ2010, year, state) %>% # Group by occ2010 and year: , minwageU, mean_hourwage
summarise(count = n(), .groups = "drop") %>% # Count observations
pivot_wider(names_from = year, values_from = count, values_fill = 0) # Pivot to wide format
# Drop rows without an occupation code.
state_covid_counts <- state_covid_counts %>%
filter(!is.na(occ2010))
# Total observation counts per state across all occupations, by year.
# across() has deprecated passing extra arguments through ...; an anonymous
# function applies na.rm = TRUE without the deprecation warning.
state_total_covid <- state_covid_counts %>%
  group_by(state) %>%
  summarise(across(starts_with("20"), function(x) sum(x, na.rm = TRUE)), .groups = "drop")
# The original re-aggregated state_total_covid by state, but it already has
# exactly one row per state, so summing again is an identity operation; a
# direct copy preserves the name without the redundant pass.
state_covid <- state_total_covid
# Each occupation's share of its OWN state's observations, per year.
# BUG FIX: the original divided each row of state_covid_counts (one row per
# state x occupation) by state_covid$`2019` etc. — a vector with one element
# per state, aligned only by position. That misaligned recycling either errors
# or silently pairs occupations with the wrong state's total. Joining the
# per-state totals by the "state" key guarantees correct alignment.
state_covid_perc <- state_covid_counts %>%
  left_join(
    state_covid %>%
      rename(
        total_2019 = `2019`, total_2020 = `2020`,
        total_2021 = `2021`, total_2022 = `2022`
      ),
    by = "state"
  ) %>%
  mutate(
    percent_2019 = (`2019` / total_2019) * 100,
    percent_2020 = (`2020` / total_2020) * 100,
    percent_2021 = (`2021` / total_2021) * 100,
    percent_2022 = (`2022` / total_2022) * 100
  ) %>%
  select(-starts_with("total_20")) # drop the helper total columns
# Restrict the state-level percentages to the occupations of interest.
state_covid_per_filt <- state_covid_perc %>%
  filter(occ2010 %in% c(2200, 3500, 3650, 4010, 4030, 4110, 4600, 4720, 5940, 7200))
# Reshape the state-level table to long form: the raw count columns ("2019"..)
# and the percent columns ("percent_2019"..) are stacked into key/value pairs,
# then the key is split into a year plus a variable name ("counts" or
# "percent_employment"), and finally re-widened so each state-occupation-year
# row carries both its count and its percent.
long_covid <- state_covid_per_filt %>%
pivot_longer(
cols = c("2019", "2020", "2021", "2022", "percent_2019", "percent_2020", "percent_2021", "percent_2022"),
names_to = "key",
values_to = "value"
) %>%
mutate(
# Recover the calendar year from either column-name form.
year = case_when(
key %in% c("2019", "2020", "2021", "2022") ~ key,
key %in% c("percent_2019", "percent_2020", "percent_2021", "percent_2022") ~ sub("percent_", "", key)
),
# Label whether the value is a raw count or a percent.
variable = case_when(
key %in% c("2019", "2020", "2021", "2022") ~ "counts",
key %in% c("percent_2019", "percent_2020", "percent_2021", "percent_2022") ~ "percent_employment"
)
) %>%
select(-key) %>%
pivot_wider(names_from = variable, values_from = value)
# year came out of the column names as character; make it numeric for joins.
long_covid <- long_covid %>%
mutate(year = as.numeric(year))
# Harmonise year types, then attach each state-year's upper minimum wage bound.
mintab <- mintab %>% mutate(year = as.numeric(year))
mintab_subset <- mintab %>% select(state, year, minwageU)
# Perform a left join to merge the minimum wage column into long_covid
long_covid <- long_covid %>%
  left_join(mintab_subset, by = c("year", "state"))
# Attach mean hourly wages and state FIPS codes from the ORG wage data;
# distinct() collapses duplicate rows produced by the many-to-one join.
org_minwage <- org_minwage %>% mutate(year = as.numeric(year))
orgminwage_subset <- org_minwage %>% select(state, year, occ2010, mean_hourwage, statefip)
long_covid <- long_covid %>%
  left_join(orgminwage_subset, by = c("year", "state", "occ2010")) %>%
  distinct()
# Flag state-occupation-years whose mean wage is within 150% of the minimum.
long_covid <- long_covid %>%
  mutate(within_150pct = ifelse(mean_hourwage <= 1.5 * minwageU, 1, 0)) # Create binary variable
# Count low-wage occupation-year cells, then total them per occupation.
binary_covid <- long_covid %>%
  filter(within_150pct == 1) %>%
  group_by(occ2010, year) %>%
  summarise(count = n(), .groups = "drop")
binary_total_covid <- binary_covid %>%
  group_by(occ2010) %>%
  summarise(total_count = sum(count), .groups = "drop")
# Attach the year-over-year percentage losses, then drop rows that never
# matched a FIPS code in the wage data.
covid_per_loss_y_over_y_subset <- covid_per_loss_y_over_y %>%
  select(year, occ2010, percentage_change)
long_covid <- long_covid %>%
  left_join(covid_per_loss_y_over_y_subset, by = c("year", "occ2010")) %>%
  filter(!is.na(statefip))
# Distinct state FIPS codes present after filtering, in character form.
# NOTE(review): statefip_covid is built here but never used by the code below.
statefip_covid <- long_covid %>%
select(statefip) %>%
distinct() %>%
mutate(statefip = as.character(statefip))
# NOTE(review): the mutate below creates a NEW column named statefip_covid in
# state_mapping rather than converting its statefip key to character; if
# long_covid$statefip and state_mapping$statefip have different types, the
# join will fail. Presumably the intent was
# mutate(statefip = as.character(statefip)) — verify before relying on this.
long_covid <- long_covid %>%
left_join(state_mapping %>% mutate(statefip_covid = as.character(statefip)), by = "statefip")
Throughout this chunk of code, a great deal of data manipulation takes place. As a shorthand explanation, the goal is to create state-level measures of job loss year over year during the period of interest, 2019-2022. By creating percentages I can also view which occupations were proportionately largest by each state and year. This sets up the data for a table demonstrating which states were proportionately hit the hardest in certain occ2010 values during the recessionary periods.
This is labeled as “Table 6,” in the Google Doc Write up.
#### Table of States that Saw the Largest Decreases in occ2010 Variables of Interest
# Weight each state's percent employment by the occupation's low-wage share,
# then export the table for the Python percentage-difference calculation.
states_strongest_covid <- long_covid %>%
  select(occ2010, year, percent_employment, state) %>%
  left_join(all_occ_2019_b, by = "occ2010") %>%
  mutate(employment_within_150 = percent_employment * mean_within_150pct) %>%
  select(-percentage_change)
write_xlsx(states_strongest_covid, "states_strongest_covid.xlsx")
# Read the Python-computed employment differences and keep the ten worst rows.
states_per_change_2019 <- read_excel("employment_diff_results_covid.xlsx")
lowest_diff_2019 <- states_per_change_2019 %>%
  arrange(employment_diff) %>%
  head(10)
# Render as a styled HTML table via kableExtra.
lowest_diff_2019 %>%
  kable("html", caption = "Top 10 Lowest Employment Diff Values") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE
  )
| occ2010 | year | percent_employment | state | mean_within_150pct | employment_within_150 | employment_diff |
|---|---|---|---|---|---|---|
| 4720 | 2020 | 10.580645 | California | 0.6937201 | 7.3400064 | -2.7591632 |
| 4030 | 2020 | 4.922725 | California | 0.6842271 | 3.3682618 | -1.6726315 |
| 4720 | 2020 | 6.482364 | Florida | 0.6937201 | 4.4969465 | -1.1009686 |
| 4720 | 2022 | 5.233645 | Alabama | 0.6937201 | 3.6306847 | -0.9981237 |
| 4720 | 2020 | 1.643836 | Maryland | 0.6937201 | 1.1403618 | -0.9915377 |
| 4720 | 2022 | 2.475248 | Maryland | 0.6937201 | 1.7171290 | -0.9856767 |
| 4720 | 2020 | 1.813685 | New Mexico | 0.6937201 | 1.2581898 | -0.8677267 |
| 4110 | 2020 | 1.262433 | Tennessee | 0.7857523 | 0.9919596 | -0.8440442 |
| 4720 | 2020 | 8.879392 | Texas | 0.6937201 | 6.1598130 | -0.8391437 |
| 4720 | 2021 | 3.234237 | New York | 0.6937201 | 2.2436554 | -0.8390301 |
Here I create a table with the mutated data from the block above this one. The percent changes are calculated in Python for ease of calculations and then are read right back into R. From the table produced, we can see that California and Florida hold the top 3 spots for seeing the strongest decrease in 2 occupation groups, 4720 and 4030 during the COVID-19 recession. These two states also happen to be 2 of the most populated states. A common trend throughout the table is that the occupation 4720 (cashiers) saw the largest decrease in 8/10 observations. The employment_within_150 is calculated by the percent_employment multiplied by the mean_within_150pct to give a value of the percent of employment for that occ2010 observation that is within the minimum wage range.
#### COVID-19 State Plots of Percentage Employment
create_state_plots <- function(long_covid, output_dir = "state_plots_covid") {
  # Save one percent-employment bar chart (PNG, 10x6 in) per state FIPS code
  # found in the data; files land in <working dir>/<output_dir>.
  full_output_dir <- file.path(getwd(), output_dir)
  if (!dir.exists(full_output_dir)) {
    dir.create(full_output_dir)
  }
  # Work with character FIPS codes throughout.
  long_covid <- long_covid %>% mutate(statefip = as.character(statefip))
  for (statefip_code in unique(long_covid$statefip)) {
    state_data <- long_covid %>% filter(statefip == statefip_code)
    # Guard clause: warn and move on when a FIPS code has no rows.
    if (nrow(state_data) == 0) {
      warning(paste("No data found for statefip:", statefip_code))
      next
    }
    state_name <- unique(state_data$state_name)
    p <- ggplot(state_data, aes(x = year, y = percent_employment, fill = as.factor(occ2010))) +
      geom_bar(stat = "identity", position = "dodge") +
      theme_minimal() +
      theme(
        axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text = element_text(size = 8)
      ) +
      labs(
        title = paste("Percent Employment in State", state_name),
        x = "Year",
        y = "Percent Employment",
        fill = "Occupation"
      )
    # File name is keyed by the FIPS code, e.g. state_06.png.
    ggsave(
      filename = file.path(full_output_dir, paste0("state_", statefip_code, ".png")),
      plot = p, width = 10, height = 6
    )
  }
}
# Generate and save the per-state COVID-period charts into ./state_plots_covid.
create_state_plots(long_covid, output_dir = "state_plots_covid")
This code creates an individual chart for each state in the dataset, keyed by statefip code. The graphs depict the percent employment of the occupations of interest chosen earlier for the 2019-2022 recession period. These graphs provide a visual representation for anyone who is curious to see which states had the highest percent employment for these occupations (spoiler: one of them is California). The ratios come from earlier code that takes the total count of occ2010 values by state and year and divides the occ2010 counts of interest by the total count to give a percentage. The function iterates over each statefip until reaching the final one at statefip = 56 (Wyoming) and then outputs the graphs into a new folder in the working directory. (To view these graphs the code will have to be run on your own.)
The California graph was used in the Google Doc write up and was labeled “Figure 19.”